
library(ncdf4)
library(ggplot2)
require(data.table)
library(dplyr)
library(plyr)
library(ggpubr) 
#require(tidyverse) #for easy data manipulation and visualization
#require(caret) #for easy machine learning workflow
#library(mgcv) #GAM

wd1 <- "C:\\Users\\YFan\\OneDrive - NORCE\\NFood_replication\\"

crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")
case = c("RCP45","RCP45_85LU","RCP85","RCP85_45CO2","RCP85_SAI","RCP85_MSB","RCP85_CCT")
casename = c("RCP45",expression('RCP45'[(SSP5LU)]),"RCP85",expression('RCP85'[(RCP45CO2)]),"RCP85+SAI","RCP85+MSB","RCP85+CCT")



#=============================prepare input data====================================
#load data prepared by Data1.GrowingSeasonClimate-Yield.R
load(paste0(wd1,"/data/RData/GridSample-GrSnClimate-Data-final.Rdata"))

#====use only years 2020-2099 data for all scenarios so the models are more comparable
df.cntl.all <- df.cntl.all[Year>=2020 & Year !=2100] 
df.scen.all <- df.scen.all[Year>=2020 & Year !=2100] 

#check object size
format(object.size(df.cntl.all), units="Gb")
format(object.size(df.scen.all), units="Gb")
df.cntl.all[Year==2090,sum(area),by=.(Scenario,Year)]
df.scen.all[Year==2090,sum(area),by=.(Scenario,Year)]


#crop failure under RCP85 and mitigation scenarios, yield=0 resulting in log(yield0)=-Inf or log(yield)=-Inf
cropfail.cntl <- df.cntl.all[, .(failure=sapply(log(yield0), is.infinite), Sample), by=.(Scenario,Crop,Irrig,Year)]
cropfail.scen <- df.scen.all[, .(failure=sapply(log(yield), is.infinite), Sample), by=.(Scenario,Crop,Irrig,Year)]
cropfail.cntl <- cropfail.cntl[, nfail:=length(unique(Sample[failure])), by=.(Scenario,Crop,Irrig,Year)]
cropfail.scen <- cropfail.scen[, nfail:=length(unique(Sample[failure])), by=.(Scenario,Crop,Irrig,Year)]
cropfail.cntl[, .(fail=mean(nfail/length(unique(Sample))*100)), by=.(Crop,Irrig)] #average failure rate in RCP85
cropfail.scen[, .(fail=mean(nfail/length(unique(Sample))*100)), by=.(Scenario,Crop,Irrig)] #3%-27% failure for rainfed crops; <1.5% for irrigated crops 
cropfail.cntl[, .(fail=mean(nfail/length(unique(Sample))*100))]
cropfail.scen[, .(fail=mean(nfail/length(unique(Sample))*100)), by=.(Scenario)]
#average crop failure rate is 3.5%for RCP85, RCP45 3.7% , SAI 3.5%, MSB 3.5%, CCT 3.4%
save(list=c("cropfail.cntl", "cropfail.scen"),file = paste0(wd1,"/data/RData/gridsample-cropfailure-data.Rdata"))
rm(cropfail.cntl, cropfail.scen)


#====Reviewer#3: why not use all data by setting the zero yield results to some arbitrarily low value instead (Log small number)
summary(df.cntl.all$yield1);summary(df.cntl.all$yield0);summary(df.scen.all$yield)
df.cntl.all[yield0>0, .(quantile(yield0, probs=c(0.01, 0.05, 0.1, 0.25, 0.5, 0.95))), by=.(Scenario,Crop,Irrig)]
df.cntl.all[, .(mean(yield0)), by=.(Scenario,Crop,Irrig)]
df.scen.all[yield>0 & Scenario=="sai", .(quantile(yield, probs=c(0.01, 0.05, 0.1))), by=.(Scenario,Crop,Irrig)]
df.scen.all[yield>0 & Scenario=="msb", .(quantile(yield, probs=c(0.01, 0.05, 0.1))), by=.(Scenario,Crop,Irrig)]
df.cntl.all[yield0>0 & Irrig==TRUE, .(quantile(yield0, probs=c(0.01, 0.05, 0.1, 0.5, 0.95))), by=.(Scenario,Crop,Irrig)]
df.cntl.all[yield0>0 & Irrig==FALSE, .(quantile(yield0, probs=c(0.05, 0.1, 0.15, 0.5, 0.95))), by=.(Scenario,Crop,Irrig)]
df.cntl.all[, .(mean(yield0)), by=.(Scenario,Crop,Irrig)] #mean ~3 tonnes C/ha for sugarcane
df.cntl.all[yield0>0 & Irrig==TRUE, .(quantile(yield0, probs=c(0.005))), by=.(Scenario,Crop)]
df.cntl.all[yield0>0 & Irrig==FALSE, .(quantile(yield0, probs=c(0.15))), by=.(Scenario,Crop)]
df.cntl.all[yield0>0, .(quantile(yield0, probs=c(0.05))), by=.(Scenario,Crop)]
df.scen.all[yield>0 & Scenario=="45", .(quantile(yield, probs=c(0.05))), by=.(Scenario,Crop)]
df.scen.all[yield>0 & Scenario=="sai", .(quantile(yield, probs=c(0.05))), by=.(Scenario,Crop,Irrig)]
df.scen.all[yield>0 & Scenario=="msb", .(quantile(yield, probs=c(0.05))), by=.(Scenario,Crop,Irrig)]
df.scen.all[yield>0 & Scenario=="cct", .(quantile(yield, probs=c(0.05))), by=.(Scenario,Crop,Irrig)]

log(0.00001);log(0.01);log(0.05);log(0.1);log(14); #log(yield) is near normal distribution 
#log normal distribution
yield.s<-sample(df.cntl.all[Crop=="Rice" & Irrig==FALSE]$yield0, 1000)
hist(yield.s, main = "Density plot of original yield", xlab = "yield", nclass=100, probability = TRUE) 
ggdensity(log(yield.s), main = "Density plot of log yield", xlab = "log (yield)") 


#=========Data preprocessing
#To prevent drastic changes in log(yield) due to zero or small values, e.g. log(0.001)-log(0.05), which is meaningless because both log(0.001) and log(0.05) indicate crop failure
#either filter these samples before regression, or set 0.05 or 0.1 tC/ha as the crop failure threshold or remove zero (log(0)=NaN) after differencing log(yield) of cntl and scen
df.cntl.new <- copy(df.cntl.all)
df.scen.new <- copy(df.scen.all)

#==set 5th percentile under RCP85 as the minimum yield value for each crop type
#crop area > 1000ha for all grid samples, set crop failure when yield <5th global percentile
#yld5th <- df.cntl.all[yield0>0 & Irrig==FALSE, .(yld5th=quantile(yield0, probs=c(0.05))), by=.(Scenario,Crop)] #5th percentile of rainfed cultivar (irrigated crops do not have zero or small yield values)
#~0.05 for wheat/cotton, ~0.1 for rice, ~0.2 for corn/sugarcane/soybean
yld5th <- df.cntl.new[yield0>0, .(yld5th=quantile(yield0, probs=c(0.05))), by=.(Scenario,Crop)] 
#~0.1 for wheat/cotton, ~0.2 for rice, ~0.3 for soybean, ~0.4 for corn/sugarcane

for (i in 1:8) {
  threshold <- yld5th$yld5th[i]
  crop <- yld5th$Crop[i]
  df.cntl.new <- df.cntl.new[yield0<=threshold & (Crop==crop), yield0:=threshold] #RCP85,  
  df.cntl.new <- df.cntl.new[yield1<=threshold & (Crop==crop), yield1:=threshold] #RCP85_45CO2, 
  df.scen.new <- df.scen.new[yield <=threshold & (Crop==crop), yield:=threshold] #RCP45_85LU,SAI/MSB/CCT
}
#NOTE: some grid cells that never grow a crop (yield0/yield1/yield < threshold across 80 years) should be removed from MLR because the response variable is always zero


df.cntl.new[, .(quantile(yield0, probs=c(0, 0.01, 0.05, 0.1))), by=.(Scenario,Crop,Irrig)]
df.scen.new[, .(quantile(yield, probs=c(0, 0.01, 0.05, 0.1))), by=.(Scenario,Crop,Irrig)]
df.cntl.new[,.(length(unique(Sample))),by=.(Scenario)] #all samples including crop failures (7060)
df.scen.new[,.(length(unique(Sample))),by=.(Scenario)] #same sample sizes across scenarios (will be filtered by nyear>=60 later)
summary(df.cntl.new$area);summary(df.scen.new$area) #minimal crop area > 1000ha
summary(df.cntl.new$yield1);summary(df.cntl.new$yield0);summary(df.scen.new$yield)



#==========if considering LUC effect (for supplementary figures)
#consider land use effect of ssp2-ssp5 (note they have different crop area weights in each grid cell)
length(unique(df.cntl.new$Sample));length(unique(df.scen.new$Sample)) #7060, same sample as selected from SSP5/RCP85: this sample mask will bias the RCP45 average yield
df.cntl.all[Year==2090,sum(area),by=.(Scenario,Year)]; df.scen.all[Year==2090,sum(area),by=.(Scenario,Year)]
df.scen.all[Year==2090,sum(area2),by=.(Scenario,Year)] #this area2 is only for estimating the confidence interval of luc.log in Boostrap resampling
df.cntl.new[Year==2090,sum(area),by=.(Scenario,Year)]
df.scen.new[Year==2090,sum(area),by=.(Scenario,Year)]
df.scen.new[Year==2090,sum(area2),by=.(Scenario,Year)]

#using original land use data from RCP45/SSP2 or select grid sample with its own mask to obtain correct global average when considering LUC effect!!
#load data prepared by Data1.GrowingSeasonClimate-Yield.R
load(paste0(wd1,"/data/Rdata/GridSample-SSP2-SSP5-Data-final.Rdata")) 
df.ssp2.all[Year==2090,sum(area),by=.(Scenario,Year)] #this is real yield and area data for RCP45, based on its own area mask (area>1000ha)

#Reviewer#3: If it is on the raw units does the fact that a 1mm/day precipitation change have a much larger impact in high latitudes matter? 
#Reviewer#3: Would normalizing by inter-annual variability change the results?
#Here, we use the standard deviation of yearly growing season mean precipitation simulated for the CNTL 21th Century to indicate the magnitude of the model's interannual variability.
df.cntl.new <- df.cntl.new[,tsa.sd:=sd(tsa.m), by=.(Scenario,Crop,Irrig,Sample)] #used to normalze by inter-annual variability at each grid before conducting per-grid MLR
df.cntl.new <- df.cntl.new[,rh.sd:=sd(rh.m), by=.(Scenario,Crop,Irrig,Sample)]   #or used to estimate effect of a unit change in P or RH
df.cntl.new <- df.cntl.new[,ppt.sd:=sd(ppt.m), by=.(Scenario,Crop,Irrig,Sample)] 
df.cntl.new <- df.cntl.new[,radd.sd:=sd(radd.m), by=.(Scenario,Crop,Irrig,Sample)]  
df.cntl.new <- df.cntl.new[,radi.sd:=sd(radi.m), by=.(Scenario,Crop,Irrig,Sample)] 
dif1 = df.cntl.new[df.scen.new[Scenario=="45"], 
               .(Scenario="RCP45 - RCP85", area.wt=area, 
                 yield0, yield1, yield,#base yield of RCP85, RCP85_45CO2, RCP45_SSP5LU, respectively (no need to convert to tonnes/ha)
                 yield.dif=log(i.yield)-log(yield1), #relative to RCP85_45CO2 (lower CO2 world)
                 co2.log =log(yield1) - log(yield0), #later add co2 effect to get RCP45_85LU-RCP85
                 luc.log = log(i.yield2)-log(i.yield), #RCP45-RCP45_85LU, note crop grid cells are not consistent between SSP2 and SSP5, resulting in INF values, later use global/regional average landuse data 
                 fertn.dif=i.fertn2-fertn, #note: SSP2 and SSP5 have different area.wt at each grid cell, better average yield and fertn to global and then differencing between SSP2 and SSP5
                 tsa.dif=i.tsa.m-tsa.m, rh.dif=i.rh.m-rh.m,   
                 ppt.dif=i.ppt.m-ppt.m, ppt.difn=(i.ppt.m-ppt.m)/ppt.sd, #normalized by inter-annual variability
                 radd.dif=i.radd.m-radd.m, radi.dif=i.radi.m-radi.m,
                 rh.log=log(i.rh.m)-log(rh.m), ppt.log=log(i.ppt.m)-log(ppt.m),
                 radd.log=log(i.radd.m)-log(radd.m), radi.log=log(i.radi.m)-log(radi.m), #add log change as alternative predictor
                 tsa.sd, rh.sd, ppt.sd, radd.sd, radi.sd), #for estimating the impact of stdv per variable
               by=.EACHI, on=.(Year,Crop,Irrig,Sample)]

dif2 = df.cntl.new[df.scen.new[Scenario!="45"],
               .(Scenario=i.Scenario, area.wt=area, yield0,yield1,yield, 
                 yield.dif=log(i.yield)-log(yield0), co2.log=0, #relative to RCP85
                 luc.log=0, fertn.dif=0,
                 tsa.dif=i.tsa.m-tsa.m, rh.dif=i.rh.m-rh.m,   
                 ppt.dif=i.ppt.m-ppt.m, ppt.difn=(i.ppt.m-ppt.m)/ppt.sd, #normalized by inter-annual variability
                 radd.dif=i.radd.m-radd.m, radi.dif=i.radi.m-radi.m,
                 rh.log=log(i.rh.m)-log(rh.m), ppt.log=log(i.ppt.m)-log(ppt.m),
                 radd.log=log(i.radd.m)-log(radd.m), radi.log=log(i.radi.m)-log(radi.m), #add log change as alternative predictor
                 tsa.sd, rh.sd, ppt.sd, radd.sd, radi.sd), #for estimating the impact of stdv per variable
               by=.EACHI, on=.(Year,Crop,Irrig,Sample)] 
dif2[Scenario=="sai",Scenario:="SAI - RCP85"]
dif2[Scenario=="msb",Scenario:="MSB - RCP85"]
dif2[Scenario=="cct",Scenario:="CCT - RCP85"]

#check sample sizes of different scenarios after differencing
length(unique(dif1$Sample));length(unique(dif2$Sample)) #7060 
df.cntl.new[,.(length(unique(Sample))), by=.(Scenario)]; 
df.scen.new[,.(length(unique(Sample))), by=.(Scenario)]
dif1[,.(length(unique(Sample))), by=.(Scenario)]; 
dif2[,.(length(unique(Sample))), by=.(Scenario)]




#=========all grid sample year data of different scenarios and crops
dif.8crop <- rbind(dif1,dif2)
is.na(dif.8crop) <- sapply(dif.8crop, is.infinite) #convert Inf (divided by 0) to NA, e.g. in luc.log
is.na(dif.8crop) <- sapply(dif.8crop, is.nan)      #convert NaN (missing) to NA
#set NA, so that they can be excluded in summary statistics by na.rm=TRUE

unique(dif.8crop$Year)
dif.8crop[,.(length(unique(Sample))), by=.(Scenario)]
dif.8crop[Year==2090,.(sum(area.wt)), by=.(Scenario)]
length(unique(dif.8crop$Crop));length(unique(dif.8crop$Year))
length(unique(dif.8crop[Crop=="Corn" & Irrig]$Scenario))
length(unique(dif.8crop[Crop=="Corn" & Irrig]$Sample)) #sample size not equal across years due to LUC
Cropgrids <- dif.8crop[,.(length(unique(Sample))), by=.(Scenario,Crop,Irrig)]
sum(Cropgrids[Scenario=="RCP45 - RCP85",]$V1) # 27913 unique cropgrid samples for each scenario
sum(Cropgrids[Scenario=="SAI - RCP85",]$V1)


#==check data distribution
#Lobell et al. 2011: yields follow a log-normal distribution and yield variance stays comparable in relative, but not absolute terms. 
#Transforming to log yields results in a more normally distributed variable.
#A log specification is also preferable because it assumes that a given change in temperature results in the same percent impact.
#Different countries have vastly different average yields that can vary by an order of magnitude.

#change in log yield (response variable) is normally distributed for each crop under each scenario
dif.s<-sample(dif.8crop$yield.dif, 1000) #log yield.dif normally distributed
ggdensity(dif.s, main = "Density plot of log(yield) change", xlab = "change in log (yield)")
dif.s<-sample(dif.8crop[Crop=="Soybean" & Irrig==FALSE]$yield.dif, 1000) #log yield.dif normally distributed
ggdensity(dif.s, main = "Density plot of log(yield) change", xlab = "change in log (yield)")
dif.s<-sample(dif.8crop[Scenario=="SAI - RCP85" & Crop=="Wheat" & Irrig==FALSE]$yield.dif, 1000) 
ggdensity(dif.s, main = "Density plot of log(yield) change", xlab = "change in log (yield)") #log yield.dif normally distributed



#==save all climate and yield change data for analysis (setting zeros and small yields to 5th percentile of each crop type under RCP85 to have valid log(yield) for all crop-grids)
saveRDS(dif.8crop, file = paste0(wd1,"/data/RData/allclimate-yield-changedata-MLR.Rds")) 

rm(df.scen.all,df.cntl.all)
rm(df.ssp2.all, df.ssp5.all)
rm(df.scen.new,df.cntl.new)
rm(dif1,dif2)


#(not necessary) normalize the change of climate variables (VIF remains the same)
# dif.8crop.norm <- dif.8crop[,.(tsa.dif=(tsa.dif-mean(tsa.dif))/sd(tsa.dif),#tmx.dif=(tmx.dif-mean(tmx.dif))/sd(tmx.dif),tmn.dif=(tmn.dif-mean(tmn.dif))/sd(tmn.dif),
#                                radd.dif=(radd.dif-mean(radd.dif))/sd(radd.dif),radi.dif=(radi.dif-mean(radi.dif))/sd(radi.dif),#Rg.n=(Rg-mean(Rg))/sd(Rg),Rdf.n=(Rdf-mean(Rdf))/sd(Rdf),
#                                ppt.dif=(ppt.dif-mean(ppt.dif))/sd(ppt.dif),rh.dif=(rh.dif-mean(rh.dif))/sd(rh.dif),#wind.n=(wind.m-mean(wind.m))/sd(wind.m),sw.n=(sw.m-mean(sw.m))/sd(sw.m),
#                                co2.log,luc.log,yield.dif,yield0,yield1,yield,area.wt,Sample,Year), by=.(Scenario, Crop, Irrig)]
# 
# dif.8crop[Scenario=="RCP45 - RCP85",cor.test(tsa.dif, rh.dif), by=.(Scenario, Crop, Irrig)]
# dif.8crop.norm[Scenario=="RCP45 - RCP85",cor.test(tsa.dif, rh.dif), by=.(Scenario, Crop, Irrig)]

#==Standardization doesn't change the collinearity between the main effects (linear terms) at all. Scaling doesn't either. Any linear transform won't do that. 
#what it changes is the correlation between main effects and their interactions or between linear and its power terms (e.g., T and T^2) and the intercept.
#Standardization also does not change the prediction, RMSE, R-squared value, Adjusted R-squared value, p-value of coefficients.
#so it does not improve MLR at all.
#see https://www.listendata.com/2017/04/how-to-standardize-variable-in-regression.html
#or https://stats.stackexchange.com/questions/16710/does-standardising-independent-variables-reduce-collinearity






#====select grid samples that exist more than 60 years during 2020-2099 for MLR at each grid cell
dif.8crop[, .(nsample=length(unique(Sample))), by=.(Crop,Irrig,Year)] #samples per crop per year (not all samples exist in all years due to transient land use)
sampleyear <- dif.8crop[, .(nyears=length(unique(Year))), by=.(Scenario,Crop,Irrig,Sample)] #occurance of years for each sample per crop
samplecrop <- sampleyear[nyears>=60] #only use those samples existing across more than 60 or all 80 years for each crop 
length(unique(sampleyear$Sample)) - length(unique(samplecrop$Sample)) #118 (<2%) unique grid samples removed when nyears>=70

dif.8crop.s <- dif.8crop[samplecrop, on=.(Scenario,Crop,Irrig,Sample)]  #not all scenarios have the same number of samples 
dif.8crop.s[, .(nsample=length(unique(Sample))), by=.(Crop,Irrig,Year)] #now similar samplesize across years per crop
dif.8crop.s[, .(nyears=length(unique(Year))), by=.(Crop,Irrig,Sample)]  #each sample has at least 60 years of data

#dif.8crop.s <- dif.8crop.s[order(Year)] #reorder the data by year
unique(dif.8crop.s$Year) 

#NOTE: some grid cells that never grow a crop (yield.dif=0 across 80 years) should be removed from MLR because the response variable is always zero
zerodif <- dif.8crop.s[,.(bad = mean(yield.dif)==0), by=.(Scenario,Crop,Irrig,Sample)]
badcells <- zerodif[bad==TRUE] #all from rainfed crop grid cells
badcells[Irrig==TRUE]
dif.8crop.s[Scenario=="SAI - RCP85" & Crop=="Corn" & Irrig==FALSE & Sample==18088]$yield.dif #bad cells have response variable yield.dif=0!
length(unique(badcells$Sample)) #970 bad cells because crop failures remain the same between SG(ER) and RCP85 in all 80 years

dif.8crop.s2 <- dif.8crop.s[!(Sample%in%badcells$Sample) & Irrig==FALSE,] #remove the bad cells (only for rainfed crops)
dif.8crop.s <- rbind(dif.8crop.s[Irrig==TRUE], dif.8crop.s2)
rm(dif.8crop.s2, zerodif)

#======summary of MLR data (grid sample size etc.)
length(unique(dif.8crop.s$Sample)) #6641 unique grids
Cropgrids <- dif.8crop.s[, .(ncropgrid=length(unique(Sample))), by=.(Scenario,Crop,Irrig)]  #sample size per crop (crop-grids)
sum(Cropgrids[Scenario=="RCP45 - RCP85",]$ncropgrid) #23928 unique per-crop-grid MLRs for each scenario
sum(Cropgrids[Scenario=="SAI - RCP85",]$ncropgrid)   
#after removing bad cells, 6641 unique grid cells contributing to 23928 crop-grid specific MLR regressions, each MLR with sample size > 60 (years)  


#====save data for per-grid-cell MLR analysis
saveRDS(dif.8crop.s, file = paste0(wd1,"/data/RData/climate-yield-changedata-MLR-selected.Rds")) 
rm(dif.8crop)
rm(samplecrop,sampleyear,Cropgrids)






